
*********************************************
/* Set up the log */
* Move to the project root. The path is quoted so that a root directory whose
* name contains spaces still resolves (the other paths in this file are quoted too).
cd "$ohie"
* Close a log that may still be open; capture keeps this from erroring when none is.
capture log close
* Date stamp (two-digit year, month, day) used in today's log file name.
global sysdate : display %tdYYNNDD date("`c(current_date)'", "DMY")
quietly log using "./logs/late_reweighting_$sysdate.log", replace

/*----------------------------------------------------------------------*/
/* PROGRAM: late_reweighting.do						*/
/*									*/
/* PURPOSE:								*/
/* [*]	This code calculates LATE in the Oregon sample reweighted to    */
/*      reflect the distribution of covariates in Massachusetts. I first*/
/*      complete this exercise on age x sex x language, and then on self*/
/*      reported health.						*/
/*									*/
/* OUTPUT:								*/
/* [*]	late_reweighting.xls: This file contains the re-weighted  */
/*	LATEs under a variety of specifications				*/
/*									*/
/*----------------------------------------------------------------------*/

* Session settings: empty the data and Mata workspaces, then set the defaults
* used by the rest of the file.
clear
clear mata
* New variables are stored as doubles unless a type is given explicitly.
set type double
set matsize 3000

*********************************************
* RUN CODE
*********************************************

*Part 1: everything that needs the MA data (parameters and bin shares) is
*computed first; the OR data are only loaded afterwards.

*Load the Mass data
use "$final/brfss.dta", clear

*Stop if any of the common covariates is system-missing for anyone
assert !(age==. | female==. | english==.)

*Parameters for the covariate distribution among compliers. They are kept in
*locals (pC, pI, prZ) so the code further down can use them.

*pC = P(D=1|Z=0)
summarize D [aweight=w] if Z==0
local pC = `r(mean)'

*pI = P(D=1|Z=1)
summarize D [aweight=w] if Z==1
local pI = `r(mean)'

*prZ = weighted mean of Z
quietly summarize Z [w=w]
local prZ = `r(mean)'

*I calculate two versions of the mass weights across the covariates age, sex, english

*First, eight discrete bins across three binary variables
*41 is the median age in OR
gen age_median = (age > 41)
sort age_median female english
egen covs = group(age_median female english)
*egen group() numbers only the cells that actually occur in the data. If a cell
*were empty (or a covariate had more than two values) the bin ids would no longer
*run 1-8 in the order assumed by the loops below and by the Oregon section, and
*bins would be matched up wrongly without any error. Stop here instead.
qui: summ covs
assert r(max) == 8
*Use kmeans clustering to choose the 8 bins for LATE reweighting
*cluster kmeans age female english, k(8) gen(covs)

*Save the results by age female english
*(one local per age x female x english cell holding that cell's bin id)
levelsof age, local(A)
foreach a in `A' {
	forv f = 0/1 {
		forv E = 0/1 {
			qui: summ covs if age==`a' & female==`f' & english==`E'
			local covs_`a'_`f'_`E' = r(mean)
		}
	}
}

*Every share below divides by pI - pC. If that difference is zero or missing the
*shares evaluate to missing, and max(0, .) would quietly turn them into 0.
assert (`pI' - `pC') != 0 & !missing(`pI' - `pC')

*Loop through the vals of covs and calc the dist for compliers
forv i = 1/8 {
	gen cov`i' = (covs == `i')
	*Weighted share of bin i within each of the four D x Z cells
	forv d = 0/1 {
		forv z = 0/1 {
			qui: summ cov`i' [w=w] if D==`d' & Z==`z'
			local Ecov`i'`d'`z' = `r(mean)'
		}
	}
	*I take the max with zero because technically we are estimating a LPM where the support is [0, 1] (1 would never be binding!)
	local cov`i'MA = max(0, ///
		`prZ'*(`pI'/(`pI'-`pC')*`Ecov`i'11' - `pC'/(`pI'-`pC')*`Ecov`i'10') ///
		+ (1-`prZ')*((1-`pC')/(`pI'-`pC')*`Ecov`i'00' - (1-`pI')/(`pI'-`pC')*`Ecov`i'01'))
}



********************************************************************************

*Now, I move onto the OR data
*Load the Oregon data
use "$final/oregonnumhh1.dta", clear

*Save the key parameters as macros for calculating covariate dist among compliers.
*pC and pI are estimated with the weights w, like prZ just below, the bin shares
*and the baseline LATE in this section, and like pC/pI in the MA section.
*Leaving them unweighted would mix weighted and unweighted moments in one formula.
* pC is defined as P(D=1|Z=0)
qui su any_medicaid if Z == 0 [aweight=w]
local pC = `r(mean)'

* pI is defined as P(D=1|Z=1)
qui su any_medicaid if Z == 1 [aweight=w]
local pI = `r(mean)'

qui: summ Z [w=w]
local prZ = `r(mean)'

*Baseline LATE
ivreg Y_num (any_medicaid = Z) [w=w], robust
*Path is quoted so an output directory containing spaces still works
outreg2 using "$output/late_reweighting.xls", excel replace nocons noaster ctitle(ER) addtext(Sample, OR full)

*Make sure no one is missing one of the common covariates
assert age!=. & female!=. & english!=.

*I calculate two versions of the mass weights across these covariates

*First, eight discrete bins across three binary variables
*41 is the median age in OR
gen age_median = (age > 41)
sort age_median female english

egen covs = group(age_median female english)
*egen group() numbers only the cells that actually occur in the data. Bin i here
*is matched to the local cov`i'MA by its id, so an empty cell (which shifts the
*ids) would pair up different bins without any error. Stop here instead.
qui: summ covs
assert r(max) == 8

*Every share below divides by pI - pC. If that difference is zero or missing the
*shares evaluate to missing, and max(0, .) would quietly turn them into 0.
assert (`pI' - `pC') != 0 & !missing(`pI' - `pC')

gen weight1 = .
*Loop through the vals of covs and calc the dist for compliers
forv i = 1/8 {
	gen cov`i' = (covs == `i')
	*Weighted share of bin i within each of the four any_medicaid x Z cells
	forv d = 0/1 {
		forv z = 0/1 {
			qui: summ cov`i' [w=w] if any_medicaid==`d' & Z==`z'
			local Ecov`i'`d'`z' = `r(mean)'
		}
	}
	local cov`i'OR = max(0, ///
		`prZ'*(`pI'/(`pI'-`pC')*`Ecov`i'11' - `pC'/(`pI'-`pC')*`Ecov`i'10') ///
		+ (1-`prZ')*((1-`pC')/(`pI'-`pC')*`Ecov`i'00' - (1-`pI')/(`pI'-`pC')*`Ecov`i'01'))
	*I take a ratio with the relevant MA local and save as the weight var.
	*I replace the weight with zero if the OR estimated weight for compliers is zero because these obs have an unidentified LATE
	replace weight1 = cond(`cov`i'OR'==0, 0, `cov`i'MA' / `cov`i'OR') if cov`i'==1
}

*First, I estimate LATE on the reweighted sample
*weight1 is the ratio of the MA to the OR complier share of each bin, and the OR
*shares were computed with the weights w. The ratio therefore has to be applied on
*top of w; using weight1 alone would drop w from this regression even though the
*baseline LATE uses it. (If w is constant the two are equivalent.)
ivreg Y_num (any_medicaid = Z) [w=w*weight1], robust
outreg2 using "$output/late_reweighting.xls", excel append nocons noaster ctitle(ER) addtext(Sample, OR full, Weight vars, age sex engl, Bin count, 8, Method, Sample reweight)


*Second, I estimate the same LATE in each bucket and then avg by the MA weights
*Each bin-level regression uses the weights w, as the baseline LATE does, so the
*bin LATEs being averaged are estimated on the same weighted sample.
local bLATE2 = 0
forv i = 1/8 {
	ivreg Y_num (any_medicaid = Z) [w=w] if covs==`i', robust
	*Add this bin's LATE times its MA complier share
	local bLATE2 = `bLATE2' + (`cov`i'MA')*(_b[any_medicaid])
}

*display the locals to add to spreadsheet manually
dis "LATE reweighting 8 bins is `bLATE2'"

*Erase ancillary txt file outreg2 created
*The path is quoted: under capture, an unquoted path containing spaces would fail
*silently and leave the file behind.
capture erase "$output/late_reweighting.txt"

*Close the log; capture keeps this from erroring if no log is open
capture log close
